Este notebook R tem como finalidade conhecer e exercitar o conjunto de pacotes tidymodels, que seria uma evolução do pacote caret para transformação de dados e ajuste (fitting) de modelos.
Reproduz ou se baseia nos seguintes tutoriais da internet:
# Reproducibility: fix the RNG seed so resampling and model results repeat
set.seed(42)
# Limit console printing (the long caret tuning table below gets truncated)
options(max.print = 150)
library(modeldata)
library(tidymodels)
library(tidyverse)
library(caret)
library(magrittr)
library(naniar)
library(furrr)
library(skimr)
library(vip)
library(workflows)
library(tune)
# Parallel backend for furrr.
# NOTE(review): plan(multicore) is a no-op on Windows and inside RStudio --
# confirm whether plan(multisession) is needed on those platforms.
plan(multicore)
# Load the credit-scoring example data set (from {modeldata})
data("credit_data")
# Normalize every column name to lower case
credit_data <- set_names(credit_data, tolower(names(credit_data)))
# First look at the structure: 4,454 rows x 14 columns
glimpse(credit_data)
Rows: 4,454
Columns: 14
$ status <fct> good, good, bad, good, good, good, good, good, good, bad, good, good, good, good, ...
$ seniority <int> 9, 17, 10, 0, 0, 1, 29, 9, 0, 0, 6, 7, 8, 19, 0, 0, 15, 33, 0, 1, 2, 5, 1, 27, 26,...
$ home <fct> rent, rent, owner, rent, rent, owner, owner, parents, owner, parents, owner, owner...
$ time <int> 60, 60, 36, 60, 36, 60, 60, 12, 60, 48, 48, 36, 60, 36, 18, 24, 24, 24, 48, 60, 60...
$ age <int> 30, 58, 46, 24, 26, 36, 44, 27, 32, 41, 34, 29, 30, 37, 21, 68, 52, 68, 36, 31, 25...
$ marital <fct> married, widow, married, single, single, married, married, single, married, marrie...
$ records <fct> no, no, yes, no, no, no, no, no, no, no, no, no, no, no, yes, no, no, no, no, no, ...
$ job <fct> freelance, fixed, freelance, fixed, fixed, fixed, fixed, fixed, freelance, partime...
$ expenses <int> 73, 48, 90, 63, 46, 75, 75, 35, 90, 90, 60, 60, 75, 75, 35, 75, 35, 65, 45, 35, 46...
$ income <int> 129, 131, 200, 182, 107, 214, 125, 80, 107, 80, 125, 121, 199, 170, 50, 131, 330, ...
$ assets <int> 0, 0, 3000, 2500, 0, 3500, 10000, 0, 15000, 0, 4000, 3000, 5000, 3500, 0, 4162, 16...
$ debt <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2500, 260, 0, 0, 0, 2000, 0, 0, 0, 0, 500, 0, ...
$ amount <int> 800, 1000, 2000, 900, 310, 650, 1600, 200, 1200, 1200, 1150, 650, 1500, 600, 400, ...
$ price <int> 846, 1658, 2985, 1325, 910, 1645, 1800, 1093, 1957, 1468, 1577, 915, 1650, 940, 50...
Os valores ausentes (NA) são avaliados com o pacote naniar.
# Per-variable missingness summary (count and percentage of NAs)
miss_var_summary(credit_data)
NA
credit_data %>% skim()
-- Data Summary ------------------------
Values
Name Piped data
Number of rows 4454
Number of columns 14
_______________________
Column type frequency:
factor 5
numeric 9
________________________
Group variables None
-- Variable type: factor --------------------------------------------------------------------------------
# A tibble: 5 x 6
skim_variable n_missing complete_rate ordered n_unique top_counts
* <chr> <int> <dbl> <lgl> <int> <chr>
1 status 0 1 FALSE 2 goo: 3200, bad: 1254
2 home 6 0.999 FALSE 6 own: 2107, ren: 973, par: 783, oth: 319
3 marital 1 1.00 FALSE 5 mar: 3241, sin: 977, sep: 130, wid: 67
4 records 0 1 FALSE 2 no: 3681, yes: 773
5 job 2 1.00 FALSE 4 fix: 2805, fre: 1024, par: 452, oth: 171
-- Variable type: numeric -------------------------------------------------------------------------------
# A tibble: 9 x 11
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
* <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 seniority 0 1 7.99 8.17 0 2 5 12 48 ▇▃▁▁▁
2 time 0 1 46.4 14.7 6 36 48 60 72 ▁▂▅▃▇
3 age 0 1 37.1 11.0 18 28 36 45 68 ▆▇▆▃▁
4 expenses 0 1 55.6 19.5 35 35 51 72 180 ▇▃▁▁▁
5 income 381 0.914 142. 80.7 6 90 125 170 959 ▇▂▁▁▁
6 assets 47 0.989 5404. 11574. 0 0 3000 6000 300000 ▇▁▁▁▁
7 debt 18 0.996 343. 1246. 0 0 0 0 30000 ▇▁▁▁▁
8 amount 0 1 1039. 475. 100 700 1000 1300 5000 ▇▆▁▁▁
9 price 0 1 1463. 628. 105 1117. 1400 1692. 11140 ▇▁▁▁▁
table(credit_data$status)
bad good
1254 3200
round(prop.table(table(credit_data$status)),2)
bad good
0.28 0.72
# 80/20 train/test split, stratified on the outcome so both sets keep the
# ~28/72 bad/good class ratio
split <- rsample::initial_split(credit_data, prop = .8, strata = "status")
df_train <- training(split)
df_test <- testing(split)
# 5-fold cross-validation folds on the training set, again stratified
train_cv <- rsample::vfold_cv(df_train, v=5, strata = "status")
# Convert the rsample folds into caret's index/indexOut format so that the
# caret and tidymodels models are tuned on identical resamples
train_cv_caret <- rsample2caret(train_cv)
In this particular example I mainly focus on imputing missing data or assigning them a new categorical level, handling infrequent/unobserved values, and one-hot encoding them.
# Preprocessing recipe for the credit data: impute missing values, handle
# rare/unseen factor levels, then one-hot encode all nominal predictors.
my_recipe <- df_train %>%
  recipe(status ~ .) %>%
  # imputation: assign a new "unknown" level to all missing factor values
  step_unknown(all_nominal(), -status) %>%
  # imputation: replace missing numeric values with the column median
  # NOTE(review): step_medianimpute() was renamed step_impute_median()
  # in recipes >= 0.1.16; kept here for compatibility with this notebook
  step_medianimpute(all_numeric()) %>%
  # combining: pool factor levels below 5% frequency (the default threshold)
  # into a single "infrequent_combined" level
  step_other(all_nominal(), -status, other = "infrequent_combined") %>%
  # add a placeholder level so factors seen only in new data do not error
  step_novel(all_nominal(), -status, new_level = "unrecorded_observation") %>%
  # OHE: one-hot encode (keep all levels, not k-1 dummies)
  step_dummy(all_nominal(), -status, one_hot = TRUE)
my_recipe
Data Recipe
Inputs:
Operations:
Unknown factor level assignment for all_nominal, -, status
Median Imputation for all_numeric
Collapsing factor levels for all_nominal, -, status
Novel factor level assignment for all_nominal, -, status
Dummy variables from all_nominal, -, status
# Estimate the recipe's statistics (medians, rare levels, dummy contrasts)
# from the training data; retain = TRUE keeps the processed training set in
# the prepped object so juice() can extract it later.
my_recipe_prep <- prep(my_recipe, retain = TRUE)
my_recipe_prep
Data Recipe
Inputs:
Training data contained 3565 data points and 335 incomplete rows.
Operations:
Unknown factor level assignment for home, marital, records, job [trained]
Median Imputation for seniority, time, age, expenses, income, assets, debt, amount, price [trained]
Collapsing factor levels for home, marital, records, job [trained]
Novel factor level assignment for home, marital, records, job [trained]
Dummy variables from home, marital, records, job [trained]
tidy(my_recipe_prep)
NA
# Toy data set used to demonstrate prep()/bake() on a minimal example
mydata <- tibble(
  class  = factor(c("dog", "dog", "dog", "cat", "cat", "monkey")),
  size   = c(53, 42, 83, 20, 30, 70),
  weight = c(15, 8, 12, 5, 6, 21)
)
# create a recipe from "mydata"
# (no formula: columns carry no outcome/predictor roles)
rcp <- recipe(mydata) %>%
  # one-hot encoding for the factor column `class`
  step_dummy(all_nominal(), one_hot = TRUE)
# prepare (estimate) the recipe; verbose = TRUE prints each step as it trains
p_rcp <- prep(rcp, verbose = TRUE)
oper 1 step dummy [training]
The retained training set is ~ 0 Mb in memory.
# apply a prepared recipe to new data -- here this just performs the
# one-hot encoding of `class` on the same toy data
bake(p_rcp, mydata)
NA
# caret resampling setup: cross-validation reusing the SAME folds built with
# rsample (via index/indexOut), so the caret and tidymodels fits use
# identical resamples and their metrics are directly comparable.
control_caret <- trainControl(
  method = "cv",
  verboseIter = FALSE,
  classProbs = TRUE,                 # class probabilities needed for ROC AUC
  summaryFunction = twoClassSummary, # reports ROC, Sens, Spec
  returnResamp = "final",
  savePredictions = "final",
  index = train_cv_caret$index,
  indexOut = train_cv_caret$indexOut
)
# Tuning grid for ranger: every combination of mtry x splitrule x min.node.size
grid_caret <- expand.grid(
  mtry = seq(from = 1, to = ncol(df_train) - 1, by = 3),
  splitrule = c("extratrees", "gini"),
  min.node.size = c(1, 3, 5)
)
# Fit a random forest (ranger) across the tuning grid, selecting the model
# with the best cross-validated ROC AUC. Training data is the
# recipe-processed training set extracted with juice().
model_caret <- train(
  status ~ .,
  data = juice(my_recipe_prep),
  method = "ranger",
  metric = "ROC",
  trControl = control_caret,
  tuneGrid = grid_caret,
  importance = "impurity", # enables variable importance (for vip)
  num.trees = 500          # was `num.tree`: avoid partial argument matching
)
print(model_caret)
Random Forest
3565 samples
29 predictor
2 classes: 'bad', 'good'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 2851, 2852, 2852, 2852, 2853
Resampling results across tuning parameters:
mtry splitrule min.node.size ROC Sens Spec
1 extratrees 1 0.7903241 0.0000000000 1.0000000
1 extratrees 3 0.7891645 0.0000000000 1.0000000
1 extratrees 5 0.7924344 0.0000000000 1.0000000
1 gini 1 0.8198287 0.0000000000 1.0000000
1 gini 3 0.8181632 0.0009950249 1.0000000
1 gini 5 0.8186128 0.0019950249 1.0000000
4 extratrees 1 0.8063489 0.4432786070 0.9219032
4 extratrees 3 0.8073462 0.4373084577 0.9187835
4 extratrees 5 0.8071579 0.4413034826 0.9230750
4 gini 1 0.8352239 0.4661791045 0.9273696
4 gini 3 0.8340549 0.4641940299 0.9254173
4 gini 5 0.8339762 0.4771293532 0.9258087
7 extratrees 1 0.8096946 0.4741542289 0.9078490
7 extratrees 3 0.8090484 0.4801293532 0.9086295
7 extratrees 5 0.8115375 0.4801293532 0.9117530
7 gini 1 0.8320368 0.4980298507 0.9117553
7 gini 3 0.8327291 0.4980497512 0.9121459
7 gini 5 0.8324745 0.4880895522 0.9148788
10 extratrees 1 0.8092484 0.4860945274 0.9043342
10 extratrees 3 0.8095218 0.4821194030 0.9062865
10 extratrees 5 0.8116609 0.4771343284 0.9090186
10 gini 1 0.8305936 0.5099950249 0.9082427
10 gini 3 0.8315586 0.5149751244 0.9074607
10 gini 5 0.8328366 0.5119800995 0.9066787
13 extratrees 1 0.8077373 0.4900646766 0.9058959
[ reached getOption("max.print") -- omitted 5 rows ]
ROC was used to select the optimal model using the largest value.
The final values used for the model were mtry = 4, splitrule = gini and min.node.size = 1.
percent(roc_auc(df_test_pred_caret, truth, estimate)$.estimate)
[1] "82%"
print(engine_tidym)
Random Forest Model Specification (classification)
Main Arguments:
mtry = tune()
trees = tune()
min_n = tune()
Engine-Specific Arguments:
importance = impurity
Computational engine: ranger
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 4713503 251.8 8741388 466.9 8357776 446.4
Vcells 12896293 98.4 23139468 176.6 23139468 176.6
# Print the tidymodels tuning results.
# NOTE(review): gridy_tidym is created in a chunk not shown here --
# presumably tune_grid() over the same stratified CV folds; confirm upstream.
gridy_tidym
# 5-fold cross-validation using stratification
# Inject the best hyper-parameters into the workflow and print the result
# (wkfl_tidym / grid_tidym_best are also defined in chunks not shown here)
(wkfl_tidym_best <- finalize_workflow(wkfl_tidym, grid_tidym_best))
== Workflow ====================================================================
[3mPreprocessor:[23m Recipe
[3mModel:[23m rand_forest()
-- Preprocessor ----------------------------------------------------------------
5 Recipe Steps
* step_unknown()
* step_medianimpute()
* step_other()
* step_novel()
* step_dummy()
-- Model -----------------------------------------------------------------------
Random Forest Model Specification (classification)
Main Arguments:
mtry = 3
trees = 983
min_n = 4
Engine-Specific Arguments:
importance = impurity
Computational engine: ranger
(wkfl_tidym_final <- last_fit(wkfl_tidym_best, split = split))
[33m![39m [33mResample1: model (predictions): Novel levels found in column 'home': NA. The leve...[39m
# last_fit reports itself as Monte Carlo cross-validation (0.8/0.2), 1 resample
# Cross-validated training performance: best mean ROC AUC found during tuning
percent(show_best(gridy_tidym, metric="roc_auc", n = 1)$mean)
[1] "83%"
# Test performance: second metric row of last_fit is roc_auc (first is
# accuracy) -- presumably; verify the .metrics ordering. Compare with the
# ~82% test AUC obtained by the caret model above.
percent(wkfl_tidym_final$.metrics[[1]]$.estimate[[2]])
[1] "81%"